Load the tweets and check that they are loaded correctly.
# Set working directory (kept commented for reference; avoid setwd() in scripts)
# getwd()
# setwd("./data/")
# Load data: Tweets_all.rda is expected to define a tibble named `tweets`
# (path is relative to the current working directory -- TODO confirm)
load("../data/Tweets_all.rda")
# Check that tweets are loaded
head(tweets)
## # A tibble: 6 × 14
## created_at id id_str full_text in_reply_to_screen_n…¹
## <dttm> <dbl> <chr> <chr> <chr>
## 1 2023-01-20 17:17:32 1.62e18 1616469988369469… "Im MSc … <NA>
## 2 2023-01-13 07:52:01 1.61e18 1613790954737074… "Was bew… <NA>
## 3 2023-01-12 19:30:01 1.61e18 1613604227141537… "Was uns… <NA>
## 4 2023-01-12 08:23:00 1.61e18 1613436367169634… "Eine di… <NA>
## 5 2023-01-11 14:00:05 1.61e18 1613158809081450… "Wir gra… <NA>
## 6 2023-01-10 17:06:11 1.61e18 1612843252083834… "Unsere … <NA>
## # ℹ abbreviated name: ¹in_reply_to_screen_name
## # ℹ 9 more variables: retweet_count <int>, favorite_count <int>, lang <chr>,
## # university <chr>, tweet_date <dttm>, tweet_minute <dttm>,
## # tweet_hour <dttm>, tweet_month <date>, timeofday_hour <chr>
# Quick overview of every column: ranges, quartiles and lengths (sanity check)
summary(tweets)
## created_at id id_str
## Min. :2009-09-29 14:29:47.0 Min. :4.469e+09 Length:19575
## 1st Qu.:2015-01-28 15:07:41.5 1st Qu.:5.604e+17 Class :character
## Median :2018-04-13 13:26:56.0 Median :9.848e+17 Mode :character
## Mean :2017-12-09 15:26:50.7 Mean :9.400e+17
## 3rd Qu.:2020-10-20 10:34:50.0 3rd Qu.:1.318e+18
## Max. :2023-01-26 14:49:31.0 Max. :1.619e+18
## full_text in_reply_to_screen_name retweet_count favorite_count
## Length:19575 Length:19575 Min. : 0.000 Min. : 0.00
## Class :character Class :character 1st Qu.: 0.000 1st Qu.: 0.00
## Mode :character Mode :character Median : 1.000 Median : 0.00
## Mean : 1.289 Mean : 1.37
## 3rd Qu.: 2.000 3rd Qu.: 2.00
## Max. :267.000 Max. :188.00
## lang university tweet_date
## Length:19575 Length:19575 Min. :2009-09-29 00:00:00.00
## Class :character Class :character 1st Qu.:2015-01-28 00:00:00.00
## Mode :character Mode :character Median :2018-04-13 00:00:00.00
## Mean :2017-12-09 02:25:45.00
## 3rd Qu.:2020-10-20 00:00:00.00
## Max. :2023-01-26 00:00:00.00
## tweet_minute tweet_hour
## Min. :2009-09-29 14:29:00.00 Min. :2009-09-29 14:00:00.00
## 1st Qu.:2015-01-28 15:07:00.00 1st Qu.:2015-01-28 14:30:00.00
## Median :2018-04-13 13:26:00.00 Median :2018-04-13 13:00:00.00
## Mean :2017-12-09 15:26:24.68 Mean :2017-12-09 14:59:43.81
## 3rd Qu.:2020-10-20 10:34:30.00 3rd Qu.:2020-10-20 10:00:00.00
## Max. :2023-01-26 14:49:00.00 Max. :2023-01-26 14:00:00.00
## tweet_month timeofday_hour
## Min. :2009-09-01 Length:19575
## 1st Qu.:2015-01-01 Class :character
## Median :2018-04-01 Mode :character
## Mean :2017-11-24
## 3rd Qu.:2020-10-01
## Max. :2023-01-01
Start preprocessing the tweets. To calculate the intervals between tweets, some additional properties are needed. We also extract the emojis from the tweets, writing them as text so they can be analyzed better in the word cloud. Finally, we detect the language of the tweets in order to build separate word clouds per language.
# Preprocessing: derive date/day/year columns, detect the tweet language and
# replace emojis with their textual names for later text analysis.
tweets <- tweets %>%
  mutate(
    # created_at is already POSIXct (see summary output above); this call is
    # effectively a no-op -- the format argument only applies to character input
    created_at = as.POSIXct(created_at, format = "%Y-%m-%d %H:%M:%S"),
    date = as.Date(created_at),
    # NOTE(review): weekdays() is locale-dependent; the English levels below
    # produce NA days when run in a non-English locale -- TODO confirm locale
    day = weekdays(created_at),
    day = factor(day, levels = c(
      "Monday", "Tuesday",
      "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"
    )),
    year = year(created_at),
    university = as.character(university),
    # detect_language() -- presumably cld2/cld3; returns a language code or NA
    language = detect_language(full_text),
    # Emojis -> descriptive text so they survive tokenization (textclean)
    full_text_emojis = replace_emoji(full_text, emoji_dt = lexicon::hash_emojis)
  )
# Remove emoji tags helper function.
#
# Strips the two-character tag markers (e.g. "<e3>") that replace_emoji()
# can leave behind in the text.
#
# @param text Character vector (the function is vectorized; no sapply needed).
# @return Character vector with every "<xy>" tag (x, y in [a-z0-9]) removed.
remove_emoji_tags <- function(text) {
  # base gsub() keeps this helper dependency-free and is vectorized like
  # str_remove_all(); the pattern is plain POSIX regex, so behavior matches
  gsub("<[a-z0-9]{2}>", "", text)
}
# remove_emoji_tags()/str_remove_all() is vectorized, so call it directly:
# sapply() would additionally set names() on the column (taken from the input
# strings themselves), silently bloating the tibble with huge name attributes.
tweets$full_text_emojis <- remove_emoji_tags(tweets$full_text_emojis)
# Store emojis in a separate column to analyze later
# NOTE(review): select() yields a one-column tibble, so emoji_unicode becomes
# a data-frame column; pull(.emoji_unicode) would give a plain vector -- confirm
# which shape downstream consumers expect before changing this.
tweets$emoji_unicode <- tweets %>%
  emoji_extract_nest(full_text) %>%
  select(.emoji_unicode)
# Count each tweet by university and hour of the day
# (group_by + count keeps the result grouped by both variables)
tweet_counts_by_hour_of_day <- tweets %>%
  group_by(university, timeofday_hour) %>%
  count() %>%
  arrange(university, timeofday_hour)
# Plot the number of tweets by university and hour of the day
ggplot(
  tweet_counts_by_hour_of_day,
  aes(
    x = timeofday_hour, y = n,
    color = university, group = university
  )
) +
  geom_line() +
  facet_wrap(~university) +
  labs(
    title = "Number of tweets by university and hour",
    x = "Hour of day",
    y = "Number of tweets"
  )
# Show most active hours for each university
# NOTE(review): the input already has one row per (university, hour), so the
# group_by + summarize below only renames n to total_tweets; slice_max keeps
# ties, hence a university could appear more than once
hours_with_most_tweets_by_uni <- tweet_counts_by_hour_of_day %>%
  group_by(university, timeofday_hour) %>%
  summarize(total_tweets = sum(n)) %>%
  group_by(university) %>%
  slice_max(n = 1, order_by = total_tweets)
print(hours_with_most_tweets_by_uni)
## # A tibble: 8 × 3
## # Groups: university [8]
## university timeofday_hour total_tweets
## <chr> <chr> <int>
## 1 FHNW 09 344
## 2 FH_Graubuenden 11 493
## 3 ZHAW 17 580
## 4 bfh 08 497
## 5 hes_so 10 315
## 6 hslu 09 380
## 7 ost_fh 08 44
## 8 supsi_ch 11 330
# Show the single most active hour overall (tweets summed across universities)
hour_with_most_tweets <- tweet_counts_by_hour_of_day %>%
  group_by(timeofday_hour) %>%
  summarize(total_tweets = sum(n)) %>%
  # one row with the maximum count (equivalent to arrange(desc()) + slice(1))
  slice_max(total_tweets, n = 1, with_ties = FALSE)
print(hour_with_most_tweets)
## # A tibble: 1 × 2
## timeofday_hour total_tweets
## <chr> <int>
## 1 11 2356
# Count each tweet by university and weekday
tweet_counts_by_week_day <- tweets %>%
  group_by(university, day) %>%
  count() %>%
  arrange(university, day)
# Plot the number of tweets by university and day of the week
ggplot(tweet_counts_by_week_day, aes(
  x = day,
  y = n, color = university,
  group = university
)) +
  geom_line() +
  facet_wrap(~university) +
  labs(
    title = "Number of tweets by university and day of the week",
    x = "Day of the week", y = "Number of tweets"
  )
# Show most active days for each university
# NOTE(review): input already has one row per (university, day); the
# group_by + summarize pair only renames n to total_tweets
days_with_most_tweets_by_uni <- tweet_counts_by_week_day %>%
  group_by(university, day) %>%
  summarize(total_tweets = sum(n)) %>%
  group_by(university) %>%
  slice_max(n = 1, order_by = total_tweets)
print(days_with_most_tweets_by_uni)
## # A tibble: 8 × 3
## # Groups: university [8]
## university day total_tweets
## <chr> <fct> <int>
## 1 FHNW Tuesday 575
## 2 FH_Graubuenden Tuesday 751
## 3 ZHAW Wednesday 636
## 4 bfh Tuesday 651
## 5 hes_so Tuesday 415
## 6 hslu Thursday 603
## 7 ost_fh Friday 65
## 8 supsi_ch Friday 461
# Calculate time intervals between tweets
# Statistical mode: returns the first-seen value with the highest frequency.
# Works for numeric and character vectors alike.
find_mode <- function(x) {
  distinct_vals <- unique(x)
  counts <- tabulate(match(x, distinct_vals))
  distinct_vals[which.max(counts)]
}
# Time interval (minutes) between consecutive tweets of the same university.
# The first tweet per university has no predecessor, hence NA (the 8 NA's in
# the summary below -- one per university).
tweets <- tweets %>%
  arrange(university, created_at) %>%
  group_by(university) %>%
  mutate(time_interval = as.numeric(
    difftime(created_at, lag(created_at), units = "mins")
  )) %>%
  # Fix: drop the grouping once the lagged difference is computed; otherwise
  # it silently leaks into every later verb (e.g. count(lang) near the end of
  # the script tallies per university instead of overall).
  ungroup()
# Descriptive statistics of time intervals
summary(tweets$time_interval)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.0 148.2 1128.8 2097.6 2428.3 220707.0 8
# setwd("../4.Text-Mining-Groupwork/plots")
# All years present in the data (used by the commented-out plotting loop below)
unique_years <- tweets$year %>% unique()
# Plot distribution of time intervals between tweets for each year
# for (curr_year in unique_years) {
# # Filter data for the specific year
# filtered_data <- tweets %>%
# filter(year(created_at) == curr_year)
# print(ggplot(filtered_data, aes(x = time_interval)) +
# geom_histogram(fill = "lightblue") +
# facet_wrap(~university) +
# labs(
# title = paste0(
# "Distribution of time intervals between tweets - ", curr_year
# ),
# x = "Time interval (minutes)",
# y = "Tweet count"
# ))
# universities <- filtered_data$university %>% unique()
# for (uni in universities) {
# # Filter data for the specific university
# uni_filtered_data <- filtered_data %>%
# filter(university == uni)
# print(ggplot(uni_filtered_data, aes(x = time_interval)) +
# geom_histogram(fill = "lightblue") +
# labs(
# title = paste0(
# "Distribution of time intervals between tweets for ", uni,
# " in ", curr_year
# ),
# x = "Time interval (minutes)",
# y = "Tweet count"
# ))
# # Calculate mode (most common interval) in hours
# most_common_interval_minutes <- find_mode(uni_filtered_data$time_interval)
# most_common_interval_hours <- most_common_interval_minutes / 60
# print(paste0(
# "Most common time interval for ", uni,
# " in ",
# curr_year,
# " is ", most_common_interval_minutes,
# " minutes (", most_common_interval_hours, " hours)"
# ))
# }
# }
# Build a quanteda corpus on the emoji-as-text column
tweets_corpus <- corpus(tweets, text_field = "full_text_emojis")
# Removes: URLs, punctuation, numbers, symbols and separators
tokens <- tokens(tweets_corpus,
  remove_punct = TRUE,
  remove_symbols = TRUE,
  remove_numbers = TRUE,
  remove_url = TRUE,
  remove_separators = TRUE
)
# Remove 'amp' (residue of the '&amp;' HTML entity) and 'rt' (retweet marker).
# NOTE(review): 'fr' and 'ber' look like mangled stems (e.g. 'über' with the
# umlaut dropped); TODO: check that the stopword languages/extra terms are
# correct before the final run.
extended_stopwords <- c(
  stopwords("en"),
  stopwords("fr"),
  stopwords("de"),
  stopwords("it"),
  "#fhnw", "#bfh", "@htw_chur", "#hslu", "#supsi",
  "amp", "rt", "fr", "ber"
)
# transform to lowercase
tokens <- tokens_tolower(tokens)
# Stem all words
tokens <- tokens_wordstem(tokens)
# Keep unigrams only here; bigram/trigram clouds are built further below
tokens <- tokens_ngrams(tokens, n = 1)
# remove stopwords in multiple languages and remove university hashtags
tokens <- quanteda::tokens_select(tokens,
  pattern = extended_stopwords,
  selection = "remove"
)
# Create document-feature matrix
doc_matrix <- dfm(tokens)
# Word frequencies, most frequent first
word_freqs <- doc_matrix %>%
  colSums() %>%
  sort(decreasing = TRUE)
# Top 20 words
head(word_freqs, 20)
## mehr neue right heut statt bfh studi
## 1099 811 709 701 614 608 581
## neuen thema hes-so findet knnen hochschul schweizer
## 536 533 533 527 518 507 504
## schweiz projekt arrow zeigt studierend gibt
## 489 463 441 433 429 428
# Word/frequency table in the shape wordcloud() expects
word_freqs_df <- data.frame(
  word = featnames(doc_matrix),
  freq = colSums(doc_matrix)
)
# Create the word cloud (seed fixed so the layout is reproducible)
set.seed(123)
wordcloud(
  words = word_freqs_df$word,
  freq = word_freqs_df$freq,
  min.freq = 5,
  max.words = 100,
  random.order = FALSE,
  rot.per = 0.35,
  colors = brewer.pal(8, "Dark2")
)
# TODO: Wordcloud per University
# TODO: Add retweet and other stuff
# Identify the tweets with the most likes (top 1000 by favorite_count)
most_liked_tweets <- tweets %>%
  arrange(desc(favorite_count)) %>%
  head(1000)
# Analyze the posting time of the most liked tweets (hour as "00".."23")
most_liked_tweets_time <- most_liked_tweets %>%
  mutate(time_of_day = format(created_at, "%H"))
# Plot the distribution of the posting times
ggplot(most_liked_tweets_time, aes(x = as.numeric(time_of_day))) +
  geom_histogram(binwidth = 1, fill = "lightblue", color = "blue") +
  labs(
    title = "Distribution of Posting Times for Most Liked Tweets",
    x = "Hour of Day",
    y = "Frequency"
  )
Analyse the content of the most liked tweets.
# Preprocessing content of most liked tweets
# (note: uses the raw full_text, not full_text_emojis like the global pipeline)
most_liked_tokens <- tokens(most_liked_tweets$full_text,
  remove_punct = TRUE,
  remove_symbols = TRUE,
  remove_numbers = TRUE,
  remove_url = TRUE,
  remove_separators = TRUE
)
# Apply the same stopword removal and transformations
most_liked_tokens <- tokens_tolower(most_liked_tokens)
most_liked_tokens <- tokens_wordstem(most_liked_tokens)
# Uni-, bi- and trigrams (unlike the global cloud, which keeps unigrams only).
# NOTE(review): stopwords are removed AFTER n-gram creation here, so bigrams/
# trigrams that merely contain a stopword are kept -- TODO confirm intended
most_liked_tokens <- tokens_ngrams(most_liked_tokens, n = 1:3)
most_liked_tokens <- tokens_select(most_liked_tokens,
  pattern = extended_stopwords, selection = "remove"
)
most_liked_doc_matrix <- dfm(most_liked_tokens)
# Word frequencies of most liked tweets, most frequent first
most_liked_word_freqs <- most_liked_doc_matrix %>%
  colSums() %>%
  sort(decreasing = TRUE)
# Create a word cloud for most liked tweets
most_liked_word_freqs_df <- data.frame(
  word = featnames(most_liked_doc_matrix),
  freq = colSums(most_liked_doc_matrix)
)
set.seed(123)
wordcloud(
  words = most_liked_word_freqs_df$word,
  freq = most_liked_word_freqs_df$freq,
  min.freq = 2,
  max.words = 100,
  random.order = FALSE,
  rot.per = 0.35,
  colors = brewer.pal(8, "Dark2")
)
# TODO: Per university
# for (uni in unique(tweets$university)) {
# Filter tweets by university
# NOTE(review): "bfh" is hard-coded while the per-university loop above is
# commented out -- re-enable the loop (or parameterize) for all universities
uni_tweets <- tweets %>%
  filter(university == "bfh")
uni_tokens <- tokens(uni_tweets$full_text,
  remove_punct = TRUE,
  remove_symbols = TRUE,
  remove_numbers = TRUE,
  remove_url = TRUE,
  remove_separators = TRUE
)
# transform to lowercase
uni_tokens <- tokens_tolower(uni_tokens)
# Stem all words
uni_tokens <- tokens_wordstem(uni_tokens)
# Keep unigrams only (same preprocessing as the global cloud)
uni_tokens <- tokens_ngrams(uni_tokens, n = 1)
uni_tokens <- tokens_select(uni_tokens,
  pattern = extended_stopwords, selection = "remove"
)
uni_dfm <- dfm(uni_tokens)
uni_word_freqs_df <- data.frame(
  word = featnames(uni_dfm),
  freq = colSums(uni_dfm)
)
# Create the word cloud
set.seed(123)
wordcloud(
  words = uni_word_freqs_df$word,
  freq = uni_word_freqs_df$freq,
  min.freq = 5,
  random.order = FALSE,
  rot.per = 0.35,
  max.words = 100,
  colors = brewer.pal(8, "Dark2")
)
# NOTE(review): library() calls belong at the top of the script
library(wordcloud2)
wordcloud2(uni_word_freqs_df, size = 0.5)
# TODO: Save wordcloud
# Analyze top emojis for this university (top_n_emojis helper on raw text)
emoji_count_per_university <- uni_tweets %>%
  top_n_emojis(full_text)
emoji_count_per_university %>%
  mutate(emoji_name = reorder(emoji_name, n)) %>%
  ggplot(aes(n, emoji_name)) +
  geom_col() +
  labs(x = "Count", y = NULL, title = "Top 20 Emojis Used")
# }
# Bigram wordcloud built from the already cleaned unigram tokens.
# NOTE(review): stopwords were removed from `tokens` first, so a bigram may
# join words that were not adjacent in the original tweet.
bi_gram_tokens <- tokens_ngrams(tokens, n = 2)
bi_gram_matrix <- dfm(bi_gram_tokens)
bi_gram_freqs_df <- data.frame(
  word = featnames(bi_gram_matrix),
  freq = colSums(bi_gram_matrix)
)
# Create the bigram word cloud
set.seed(123) # For reproducibility
wordcloud(
  words = bi_gram_freqs_df$word,
  freq = bi_gram_freqs_df$freq,
  min.freq = 3,
  max.words = 100,
  random.order = FALSE,
  rot.per = 0.35,
  colors = brewer.pal(8, "Accent")
)
# Trigram wordcloud (same caveat as the bigrams: stopwords are already gone,
# so a trigram may join words that were not adjacent in the original tweet)
tri_gram_tokens <- tokens_ngrams(tokens, n = 3)
tri_gram_matrix <- dfm(tri_gram_tokens)
tri_gram_freqs_df <- data.frame(
  word = featnames(tri_gram_matrix),
  freq = colSums(tri_gram_matrix)
)
# Create the trigram word cloud
set.seed(123) # For reproducibility
wordcloud(
  words = tri_gram_freqs_df$word,
  freq = tri_gram_freqs_df$freq,
  min.freq = 2,
  max.words = 100,
  random.order = FALSE,
  rot.per = 0,
  colors = brewer.pal(8, "Paired")
)
### LDA Topic Modeling
# LDA cannot handle empty documents, so drop those that became empty after
# cleaning (fixed seed keeps the topic assignment reproducible)
new.dfm <- dfm_subset(doc_matrix, ntoken(doc_matrix) > 0) # drops all rows containing only 0s
tweet_lda <- LDA(new.dfm, k = 5, control = list(seed = 123))
# Tidy the LDA results: one row per (topic, term) with its beta weight
topic_terms <- tidy(tweet_lda, matrix = "beta")
# Extract topics and top terms
topics <- as.data.frame(terms(tweet_lda, 50)) # First fifty words per topic
# Tidy the LDA results (defaults to the beta matrix, same as topic_terms above)
tweet_lda_td <- tidy(tweet_lda)
# Extract top terms per topic; slice_max() supersedes top_n() in current dplyr
top_terms <- tweet_lda_td %>%
  group_by(topic) %>%
  slice_max(beta, n = 8) %>% # Top 8 terms per topic (keeps ties, like top_n)
  ungroup() %>%
  arrange(topic, -beta)
# Visualize top terms per topic; reorder_within()/scale_y_reordered() are the
# tidytext helpers that order terms independently inside each facet
top_terms %>%
  mutate(term = reorder_within(term, beta, topic)) %>%
  ggplot(aes(beta, term, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~topic, scales = "free") +
  scale_y_reordered() +
  labs(
    x = "Beta (Term Importance within Topic)",
    y = NULL,
    title = "Top Terms per Topic in Tweets (LDA)"
  )
# Most different words among topics (using log ratios).
# NOTE(review): only topics 1-3 take part in the filter and the ratios even
# though k = 5; topics 4 and 5 are carried along but never compared -- TODO
# confirm this is intended. Also `diff` shadows base::diff within this script.
diff <- tweet_lda_td %>%
  mutate(topic = paste0("topic", topic)) %>%
  # pivot_wider supersedes spread(); one column per topic holding beta
  pivot_wider(names_from = topic, values_from = beta) %>%
  filter(topic1 > .001 | topic2 > .001 | topic3 > .001) %>%
  mutate(
    logratio_t1t2 = log2(topic2 / topic1),
    logratio_t1t3 = log2(topic3 / topic1),
    logratio_t2t3 = log2(topic3 / topic2)
  )
diff
## # A tibble: 171 × 9
## term topic1 topic2 topic3 topic4 topic5 logratio_t1t2 logratio_t1t3
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 @bfh_hesb 1.66e-3 9.62e-4 1.86e-3 3.45e-4 3.41e-4 -0.790 0.157
## 2 @enginee… 5.06e-4 1.57e-3 1.56e-3 1.10e-3 4.09e-4 1.63 1.63
## 3 @fhnw 2.70e-4 1.53e-3 3.34e-4 1.61e-3 1.05e-3 2.50 0.305
## 4 @fhnwbusi 1.63e-4 2.15e-3 2.31e-3 3.13e-4 2.98e-3 3.72 3.83
## 5 @fhnwtec… 1.87e-3 4.11e-5 7.78e-4 1.02e-3 6.08e-4 -5.51 -1.27
## 6 @hes_so 1.06e-3 6.60e-4 9.32e-4 6.51e-4 4.86e-4 -0.678 -0.179
## 7 @hsafhnw 1.27e-3 8.45e-4 4.57e-4 1.14e-3 1.73e-3 -0.584 -1.47
## 8 @hslu 1.44e-3 4.08e-3 2.08e-3 6.85e-4 2.08e-4 1.50 0.525
## 9 @supsi_ch 8.45e-4 1.36e-3 5.18e-4 1.46e-4 2.96e-5 0.681 -0.705
## 10 @zhaw 6.62e-4 2.68e-3 3.90e-4 1.47e-3 8.58e-4 2.02 -0.763
## # ℹ 161 more rows
## # ℹ 1 more variable: logratio_t2t3 <dbl>
# Add topic probabilities to original data
# lda_gamma <- tidy(tweet_lda, matrix = "gamma")
# tweets <- tweets %>%
# mutate(document_id = row_number()) %>% # Add a unique ID for each tweet
# left_join(lda_gamma, by = c("document_id" = "document"))
# Analyze topics by university
# tweets %>%
# count(university, topic, wt = gamma) %>%
# group_by(university) %>%
# slice_max(n = 3, order_by = n, with_ties = FALSE) %>%
# ungroup() %>%
# mutate(topic = paste0("Topic ", topic)) %>%
# ggplot(aes(x = university, y = n, fill = topic)) +
# geom_col(position = "dodge") +
# labs(title = "Top 3 Topics by University", y = "Topic Proportion")
# Distribution of tweet lengths (characters of the raw tweet text)
tweets %>%
  mutate(tweet_length = nchar(full_text)) %>%
  ggplot(aes(x = tweet_length)) +
  geom_histogram() +
  labs(title = "Distribution of Tweet Lengths")
### Sentiment Analysis
# Calculate sentiment for supported languages only
# (language = detected language column from preprocessing, not Twitter's lang)
langs <- c("de", "fr", "it", "en")
tweets_filtered <- tweets %>%
  filter(language %in% langs)
# Syuzhet sentiment for a single tweet.
#
# @param text  Character scalar, the tweet text.
# @param lang  Language code (e.g. "de") to score with.
# @param langs Supported language codes; defaults to the same set as the
#              global `langs`, removing the hidden dependency on a global.
# @return Numeric sentiment score, or NA_real_ for unsupported languages.
# NOTE(review): with method = "syuzhet" the language argument may be ignored
# (the syuzhet lexicon is English-only) -- TODO verify per-language handling.
get_syuzhet_sentiment <- function(text, lang,
                                  langs = c("de", "fr", "it", "en")) {
  if (lang %in% langs) {
    # Spell out `language =` instead of relying on partial matching of `lang =`
    get_sentiment(text, method = "syuzhet", language = lang)
  } else {
    NA_real_ # typed NA keeps mapply() results numeric for unsupported languages
  }
}
# Calculate syuzhet sentiment for each tweet.
# Fix: use the detected `language` column -- the filter above kept rows whose
# `language` is supported, but the original passed Twitter's `lang` field,
# which can disagree with it (producing spurious NAs / wrong language codes).
tweets_filtered$sentiment <-
  mapply(
    get_syuzhet_sentiment,
    tweets_filtered$full_text,
    tweets_filtered$language
  )
# Sentiment over time: monthly mean per university
tweets_filtered$month <- floor_date(tweets_filtered$created_at, "month")
plot_data <- tweets_filtered %>%
  group_by(university, month) %>%
  summarize(mean_sentiment_syuzhet = mean(sentiment, na.rm = TRUE))
# Plot Syuzhet sentiment for all universities
ggplot(plot_data, aes(
  x = month,
  y = mean_sentiment_syuzhet,
  color = university, group = university
)) +
  geom_line() +
  labs(
    title = "Mean Syuzhet Sentiment Over Time by University",
    y = "Mean Sentiment Score"
  ) +
  scale_x_datetime(date_breaks = "1 month", date_labels = "%Y-%m") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# TODO: Check sentiment and use syuzhet
# for (uni in unique(tweets$university)) {
uni <- "bfh"
# NOTE(review): this filter uses Twitter's `lang` while the refilter further
# below uses the detected `language` column -- TODO pick one consistently
uni_tweets <- tweets %>%
  filter(university == uni, lang %in% langs)
uni_tweets$sentiment <-
  mapply(get_syuzhet_sentiment, uni_tweets$full_text, uni_tweets$lang)
# Sentiment over time (per university), grouped by year for faceting
uni_tweets$month <- floor_date(uni_tweets$created_at, "month")
uni_tweets$year <- year(uni_tweets$month)
plot_data <- uni_tweets %>%
  group_by(year, month) %>%
  summarize(mean_sentiment = mean(sentiment, na.rm = TRUE))
# Plot Syuzhet sentiment over time (per university); explicit print() so the
# plot also renders inside the (currently commented-out) loop
print(ggplot(plot_data, aes(x = month, y = mean_sentiment)) +
  geom_line(aes(color = as.factor(year))) +
  labs(
    title = paste0("Mean Syuzhet Sentiment Over Time by - ", uni),
    y = "Mean Sentiment Score"
  ) +
  scale_x_datetime(date_breaks = "1 month", date_labels = "%Y-%m") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  facet_wrap(~year, scales = "free_x"))
# Re-filter by the detected language column for the word-level analysis
uni_tweets <- tweets %>%
  filter(university == uni, language %in% langs)
# Tokenize and preprocess words.
# Fix: tokenize the university subset (uni_tweets), not the full tweets table;
# the word clouds below are titled "... for <uni>" but previously showed words
# from every university.
uni_words <- uni_tweets %>%
  unnest_tokens(word, full_text) %>%
  anti_join(stop_words) # remove Stopwords
# Join the bing sentiment lexicon (positive/negative labels) with the words
sentiment_words <- uni_words %>%
  inner_join(get_sentiments("bing"), by = "word")
# Separate positive and negative words, with counts sorted descending
positive_words <- sentiment_words %>%
  filter(sentiment == "positive") %>%
  count(word, sort = TRUE)
negative_words <- sentiment_words %>%
  filter(sentiment == "negative") %>%
  count(word, sort = TRUE)
# Create and display positive/negative word clouds side by side
par(mfrow = c(1, 2)) # Set up side-by-side plots
# NOTE(review): par() is changed without restoring the previous value; later
# base plots in this session keep the 1x2 layout -- consider saving/restoring
wordcloud(
  words = positive_words$word,
  freq = positive_words$n,
  random.order = FALSE,
  rot.per = 0.35,
  colors = brewer.pal(8, "Greens")
)
title(main = paste("Positive Words for", uni), line = 2)
wordcloud(
  words = negative_words$word,
  freq = negative_words$n,
  random.order = FALSE,
  rot.per = 0.35,
  colors = brewer.pal(8, "Reds")
)
title(main = paste("Negative Words for", uni), line = 2)
# }
# Language Analysis
# NOTE(review): `tweets` is still grouped by university (the group_by() from
# the time-interval step is never ungrouped), so count(lang) tallies per
# (university, lang) -- see the "Groups: university [8]" header in the output.
# Ungroup first to get a true overall per-language count.
tweets %>%
  count(lang) %>%
  arrange(desc(n))
## # A tibble: 99 × 3
## # Groups: university [8]
## university lang n
## <chr> <chr> <int>
## 1 bfh de 3008
## 2 hslu de 2988
## 3 ZHAW de 2941
## 4 FH_Graubuenden de 2677
## 5 FHNW de 2570
## 6 supsi_ch it 1849
## 7 hes_so fr 1716
## 8 supsi_ch en 260
## 9 ost_fh de 259
## 10 FH_Graubuenden en 228
## # ℹ 89 more rows
# Emoji Analysis: most-used emoji names across all tweets
# (top_n_emojis is a helper defined elsewhere; presumably returns the top 20,
# matching the plot title -- TODO confirm)
emoji_count <- tweets %>%
  top_n_emojis(full_text)
emoji_count %>%
  mutate(emoji_name = reorder(emoji_name, n)) %>%
  ggplot(aes(n, emoji_name)) +
  geom_col() +
  labs(x = "Count", y = NULL, title = "Top 20 Emojis Used")
# insights <- list(
# "Most Active Hours" = hours_with_most_tweets_by_uni,
# "Most Active Days" = days_with_most_tweets_by_uni,
# "Most Common Time Intervals" = most_common_interval_minutes,
# "Content Analysis" = head(word_freqs),
# "Sentiment Analysis" = tweet_sentiment
# )